; Program entry. The comment on the org line lists the register and memory
; state this code expects to find when it starts.
org 100h ; assume cs<=0x2a18 [0xfd..ff]=0 bx=0 sp=-2 si=0x100

; w(label)/d(label) read a word/dword constant by its label. The frame loop
; keeps bp=0 and si=0x100, so bp+si-0x100+xx evaluates to xx; `byte` forces the
; 8-bit displacement form. Using bp as the base register makes the access
; default to SS instead of DS, which matters because DS is repointed at the
; texture segment by the lds at S (presumably SS still equals CS - confirm).
%define w(xx) word[byte bp+si-0x100+xx]
%define d(xx) dword[byte bp+si-0x100+xx]

  push 0xa000   ;=68 00 a0  screen segment, popped into es at S; the opcode
                ; byte 0x68 at offset 0x100 is also read by the lds at S
  jmp S         ; skip over the constant table

; Constant table, read through the w()/d() macros. Those macros use an 8-bit
; displacement relative to 0x100, so the table has to stay close to the start
; of the image.
C7fff dw 0x7fff ; scale applied to each sqrt'ed colour channel before packing (loop A)

CMUL dd 0.008 ; scale of the interpolated texture sample (loop R in I)
BMUL dd 0.004 ; scale of the directly addressed texture sample (loop RZ in I)
XMUL dd 0.0000244140625  ; 1/20480/2, scales the saved 16-bit x coordinate
YMUL dd 0.00001953125  ; 1/20480/320*256/2 = 1/25600/2, scales the saved y coordinate
AMUL dd 40.7436654315252059568342434233637 ; 128/pi, maps the angle -pi..pi to -128..128
SMUL dd 1.3333333 ; gain step of the sun term: applied once for G, twice for R
RMUL dw 6*256 ; numerator of RMUL/r, the radial texture coordinate


; Init: point ds at the texture segment, set the video mode, es:di -> screen.
; lds loads bp from [si-3] and ds from [si-1]. With si=0x100 and zero bytes at
; 0xfd..0xff (see the org line), the ds word is 00 followed by the 0x68 opcode
; byte of the push at offset 0x100.
S lds bp,[si-3] ; bp=0, ds=0x6800 texture
  mov ax,0x4f02 ; VBE function: set video mode
  mov bx,0x10e  ; mode number; bx is not reloaded before the texture loop below
  int 10h    ; set mode 320x200 with 65536 colors; assume it's ok (ax=0x004f)
  pop es        ; es=a000 screen
  scasw ; di=0  (scasw advances di by 2, so this presumably relies on di=-2
        ;        on entry and DF=0 - neither is listed on the org line)

; generate texture: uses bp,dx,cx,si
; Each pass appends bx texels to the texture in ds, continuing at si. Rows are
; 256 bytes wide (I addresses texels as bh=row, bl=column), so the texel read
; by lodsb is one row up and one column right of the one being written.
; Iteration count: bx is whatever the preceding code left in it.
;   first pass : 0x10e from the mode-set call (if int 10h preserves bx)
;   later passes: 0x200, set by `mov bh,2` at the end of the frame loop
;  xor bx,bx       ; (disabled) would generate 65536 texels on init
M:
T lodsb           ; load top right texel, advance si (= time)
  dec bp          ; bp, dx form the pseudo-random state
  ror bp,cl
  xor dx,bp
  sar dx,3
  adc cx,dx ; ch += random (-16..16)
  adc ch,al ; ch += top right texel, or last pixel (on frame start)
  rcr cx,1        ; average ch to 0..127; cl,carry = old texel bits
  mov [si+254],ch ; store texel = (left + top right + random + adc) / 2
                  ; si was already incremented: this is 255 bytes past the texel read
  dec bx          ; remaining texels for this pass (see the counts above)
  jnz T
  
  ; Render one frame: for every pixel index di, compute a colour with I and
  ; store it as a 16-bit pixel. Falls back to the texture generator (M) when
  ; the frame is done, unless ESC is pressed.
  pusha         ; frame-level save; at this point bx=0 and si=texture position

  mov cx,si     ; cx = texture write position, used as the time value (ch) in I
  xor bp,bp     ; bp=0 and si=0x100 make the w()/d() macros and the [bp+si]
  mov si,0x100  ; scratch cell resolve to fixed offsets


; Per-pixel loop. di = pixel index. 0xcccd ~ 2^24/320, so after the multiply
; dh = row and dl:ax = column scaled to 16.16 of the row width.
X mov ax,0xcccd ; convert width 320 -> 65536
  mul di
  xchg ax,bx    ; full 16-bit precision of X
  mov ax,0x4f05 ; low word of the centre offset; also the VBE function number
                ; (window control) for the int 10h below
  add bx,ax
  adc dx,0x9b80 ; put center at [100, 159.5]: should add 0x9b804d46

  pusha ; [-18-16-14-12-10 -8 -6 -4] on the stack
        ;   di si bp sp bx dx cx ax
        ;                  yy tt
        ;                x x
        ; These offsets ignore the frame-level pusha above; with it in place
        ; every slot sits 16 bytes lower, hence the extra -16 used in I.
        ; The offsets are absolute (bp=0) and rely on sp=-2 at program start.

;Is it time to set the VESA bank?
  add di,di     ; di=pixel address
  jnz D         ; byte address wraps to 0 only at pixel 0 and pixel 0x8000
  cwd           ; ax=0x4f05 is positive, so dx=0; flags are untouched
  adc dx,dx     ; dx=page (0 or 1)
  xor bx,bx     ; bh=0 (set bank), bl=0 (window id)
  int 10h       ; assume 64kB granularity and window at 0xA000
  ; NOTE(review): on this path the live dx/bx no longer hold the pixel
  ; coordinates. I reads x/y from the pusha frame, but its `add dh,ch` lookup
  ; uses the live dx, so these two pixels presumably sample a different texel.

;Compute a new color or load it from the mirror buffer.
D:
  call I        ; compute a new color
  ; Pack the three FPU values left by I into one RGB565 word. ax starts as a
  ; single marker bit; each pass shifts in 5, 6 and 5 bits, and the loop ends
  ; when the marker is shifted out into the carry flag.
  xor ax,ax         ; R G B
  inc ax  ; ax=1
  mov cx,6
A fsqrt         ; sRGB approximation
  fimul w(C7fff)
  fistp word[bp+si] ; if it's > 0x7fff, clamp to 0x8000
  imul bx,[bp+si],2 ; double, set carry if it was > 0x3fff
  sbb bx,bp         ; bp=0: subtract the carry only; the clamped 0x8000 doubles
                    ; to 0 with carry set and so becomes 0xffff
  xor cl,5^6        ; cl & 0x1f = shift length: flip 5<->6
  shld ax,bx,cl ; rrrrrggggggbbbbb
  jnc A ; loop 3x until you shift the 1 bit out of ax

  stosw         ; write the pixel to es:di (di = byte address here)
  popa          ; restore the per-pixel frame, including di = pixel index
  inc di
  jnz X  ; di=0
  
  popa          ; restore the frame-level registers (bx=0, si=texture position)
  mov bh,2      ; bx=0x200: texel count for the next texture pass

;ESC?
  in al,60h     ; read the keyboard data port
  cmp al,1      ; scancode 1
  jne M     ; fallthrough, exit later
            ; On ESC execution runs into I; the ret at the end of I then
            ; returns through the word on top of the stack (presumably the
            ; zero word at sp=-2, i.e. a jump to offset 0 - confirm).

; I: compute the colour of one pixel.
; In:   bp=0, si=0x100, ds=texture segment
;       per-pixel pusha frame from loop X on the stack (read at fixed offsets)
;       ch = time, dx = coordinate word computed in loop X
; Out:  st0, st1, st2 = R, G, B
; Uses: ax, bx, cx, dx, flags, and the 4 scratch bytes at [bp+si]
; Also reached by fallthrough from the ESC check; its ret then ends the program.
I fild word[bp-8-16]  ; saved dx of the per-pixel frame: row coordinate
  fmul d(YMUL)
  fild word[bp-9-16]  ; unaligned word = saved bh (low) and dl (high): column
  fmul d(XMUL)   ; x=[-9]/BIG y=[-8]/BIG

  add dh,ch      ; dh = row + time + 0x80; dx is used as the texel address of
  add dh,0x80    ; the second layer (loop RZ)

  ; c = vec2(atan(uv.x, uv.y) / (2*Pi) + .5, RMUL/length(uv) + TMUL*iTime);

  fld st1        ; y x y
  fpatan         ; a=atan(x/y) y  ; -π..π, 0=vertical
  fld st0        ; a a y
  fcos           ; cos(a) a y
  fdivp st2,st0  ; a r=y/cos(a)  ; r=√(x²+y²)
  fmul d(AMUL)   ; A=AMUL*a r
  fistp dword[bp+si]; store A
  mov bl,[bp+si] ; bl = texture column = low byte of A (angle wraps at 256)

  fld st0        ; r r
  fidivr w(RMUL) ; RMUL/r r
  fistp dword[bp+si]  ; store R
  mov bh,[bp+si+1] ; bh = texture row = byte 1 of R; byte 0 stays in the
                   ; scratch cell as the interpolation weight for loop R
  add bh,ch        ; scroll the rows with time


; Sun            ; r
  fmul st0
  fld1
  fsubrp st1,st0 ; 1-r²
  fmul st0
  fmul st0
  fmul st0
  fmul st0       ; s=(1-r²)¹⁶

  fld st0
  fmul d(SMUL)
  fld st0
  fmul d(SMUL)  ; 1.3²s 1.3s s [= R G B]

;  vec3 tex = vec3( fbm(MOD*c, MOD), fbm(MOD*c - .05, MOD), fbm(MOD*c - .1, MOD) );
;  vec3 col = .5 * pow(0.9 + 0.5*tex, vec3(24,16,8))

; Layer 1: one pass per channel. Each pass blends two vertically adjacent
; texels with the weight in [bp+si], scales by CMUL, squares the result cx
; times (exponent 8, 4, 2 for cx=3, 2, 1) and adds it to st0. fstp st3 then
; moves the finished channel to the bottom, so after three passes the stack is
; back in R G B order. bh advances one row per pass.
  push dx         ; dx is clobbered by the xchg below
  mov cx,3
R mov al,[bp+si]  ; linear interpolation
  not al          ; 255 - weight
  mul byte[bx]    ; ax = (255-weight) * texel[row]
  xchg ax,dx
  inc bh
  mov al,[bp+si]
  mul byte[bx]    ; ax = weight * texel[row+1]
  add ax,dx
  shr ax,8        ; blended texel; ah=0 from here on

  mov [bp+si+2],ax
  fild word[bp+si+2]
  fmul d(CMUL)
  push cx
Q fmul st0
  loop Q
  faddp           ; add to the current channel
  fstp st3        ; rotate: finished channel goes to the bottom of the stack
  pop cx
  loop R
  pop dx


; Layer 2: same accumulate-and-rotate scheme, but with a single texel per
; channel addressed by dx (dh = row, dl = column) and scaled by BMUL. The row
; steps back by one per pass.
  mov cx,3
  mov bx,dx
RZ dec bh
  mov al,[bx]
  mov [bp+si],ax  ; ah is still 0 from the shr in loop R
  fild word[bp+si]
  fmul d(BMUL)
  push cx
QZ fmul st0
  loop QZ
  pop cx
  faddp
  fstp st3
  loop RZ

  ret
